#if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
#remotes::install_version("Seurat", version = "4.3.0", lib = "~/bin/seurat_4.3.0")
#install.packages("ggplot2")
#install.packages("dplyr")
#BiocManager::install("glmGamPoi")
#install.packages("sctransform")
#install.packages("purrr")
#install.packages("HGNChelper")
#install.packages("openxlsx")
library(Seurat, lib.loc = "~/bin/seurat_4.3.0")
## Attaching SeuratObject
## Seurat v4 was just loaded with SeuratObject v5; disabling v5 assays and
## validation routines, and ensuring assays work in strict v3/v4
## compatibility mode
library(sctransform)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(HGNChelper)
library(openxlsx)
library(stringr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
#Load control and Cal09 infected gene expression matrices (output by CellRanger)
mock.data <- Read10X(data.dir = "")
cal.data <- Read10X(data.dir = "")
mock <- CreateSeuratObject(counts = mock.data,
min.features = 200,
project = "mock")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
cal <- CreateSeuratObject(counts = cal.data,
min.features = 200,
project = "cal")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
#Add a column which describes the experimental condition for each dataset
mock[["condition"]] <- "mock"
cal[["condition"]] <- "cal"
#Merge two Seurat objects
merged_seurat <- merge(x = mock,
y = cal,
add.cell.id = c("mock", "cal"))
#Calculate mitochondrial genes
merged_seurat[["percent.mt"]] <- PercentageFeatureSet(merged_seurat, pattern = "^MT-")
#Calculate number of genes detected per UMI (give a sense of data complexity)
merged_seurat[["log10GenesPerUMI"]] <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)
#Create column that replicates the rownames
merged_seurat[["cells"]] <- rownames(merged_seurat@meta.data)
Cells captured per condition:
merged_seurat@meta.data %>%
ggplot(aes(x=condition, fill=condition)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("Number of cells per condition")
UMI counts per cell:
merged_seurat@meta.data %>%
ggplot(aes(color=condition, x=nCount_RNA, fill= condition)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500, linetype = "dotted") +
geom_vline(xintercept = 100000, linetype = "dotted") +
ggtitle("Number of UMIs per cell")
Number of genes per cell:
merged_seurat@meta.data %>%
ggplot(aes(color=condition, x=nFeature_RNA, fill= condition)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
ggtitle("Genes detected per cell") +
geom_vline(xintercept = 600, linetype = "dotted")
Bimodal distribution is typical of NHBE cells.
Number of genes per UMI:
merged_seurat@meta.data %>%
ggplot(aes(color=condition, x=log10GenesPerUMI, fill= condition)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = 0.8, linetype = "dotted") +
ggtitle("Average genes per UMI\n(Complexity metric)")
Histogram of the mitochondrial gene counts:
merged_seurat@meta.data %>%
ggplot(aes(color=condition, x=percent.mt, fill=condition)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 40, linetype = "dotted") +
ggtitle("Percent mitochondrial gene counts")
Higher mitochondrial gene percentages is also typical of NHBE cells. Assess cutoffs of UMIs between 500 and 100,000 and number of unique features/transcripts at 700:
merged_seurat@meta.data %>%
ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
#scale_colour_gradientn(colours = "white", "white", "gray", "black",
# breaks = c(0, ))
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500, linetype = "dotted") +
geom_vline(xintercept = 100000, linetype = "dotted") +
geom_hline(yintercept = 600, linetype = "dotted") +
facet_wrap(~condition)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
After applying above filters, plus a filter for MT% greater than 40%. Look at violin plots of the main QC metrics after filtering:
filtered_merged_seurat <- subset(merged_seurat,
subset = (nCount_RNA > 500) &
(nCount_RNA < 100000) &
(nFeature_RNA > 600) &
(log10GenesPerUMI > 0.8) &
(percent.mt < 40))
VlnPlot(filtered_merged_seurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "condition")
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
seurat_phase <- NormalizeData(filtered_merged_seurat) #Normalize the data
seurat_phase <- FindVariableFeatures(seurat_phase, selection.method = "vst") #Find variable genes
seurat_phase <- ScaleData(seurat_phase) #Scale the data
## Centering and scaling data matrix
seurat_phase <- RunPCA(seurat_phase) # Perform PCA on the variable genes
## PC_ 1
## Positive: S100A2, S100A9, KRT5, KRT17, KRT6A, SERPINB3, LY6D, FABP5, TIMP1, KRT13
## SPINK5, SFN, FRMD6, CCND1, DSG3, AGR2, S100A8, IGFBP6, PHLDB2, TUBB
## KRT15, MMP28, TP63, FLNA, IL1RN, KRT14, TNC, COL17A1, LGALS7, RBP1
## Negative: AGBL4, DNAH11, DCDC1, CFAP54, DNAH3, WDR49, SERPINI2, HYDIN, VWA3A, CFAP299
## CDHR3, DNAH9, DNAH7, LMNTD1, RP1, DNAH12, DNAH6, PACRG, DNAI1, ADGB
## FAM227A, CFAP43, SPEF2, DNAAF1, CFAP61, ARMC3, ZBBX, CFAP46, STK33, TTC29
## PC_ 2
## Positive: KRT5, S100A2, COL17A1, FLNA, IGFBP6, SFN, BCAM, LAMA3, TP63, S100A10
## TENM2, DST, CCDC3, FRMD6, KRT17, C16orf74, CAV1, IGFBP7, ANKH, KRT15
## RBBP8, KRT6A, TUBB6, CALD1, FHOD3, TIMP1, CDH13, MMP28, COL4A6, KRT14
## Negative: CD55, WFDC2, LCN2, BPIFB1, C3, CP, MUC16, PIGR, SLPI, TMC5
## SLC6A14, CXCL17, ABCA13, FAM3D, PLEKHS1, MUC1, ANKRD36C, RIMS1, PPP1R9A, CATSPERB
## FER1L6, FCGBP, MSMB, CAPN13, ARHGEF38, MUC5B, CEACAM6, MUC20, MACC1, S100P
## PC_ 3
## Positive: PARD3B, FTO, RARB, SORBS2, ITGB8, LAMC2, STK38L, FMNL2, GPRC5A, RNF213
## HDAC9, ANTXR1, RIMS2, ITPKC, DST, PTK2B, INPP4B, HMGA2, CD55, TNS3
## CATSPERB, WDPCP, TTC6, KLF7, NEDD9, PLEKHG1, ERBB4, ATP10B, NBEA, B4GALNT3
## Negative: FTH1, DYNLL1, CSTB, IFI27, TUBB4B, S100A9, CAPS, MORN2, DYNLT1, C20orf85
## SCGB1A1, SCGB3A1, SLPI, SERPINB3, LCN2, C9orf24, TUBA1A, CRIP1, PIERCE1, TMEM190
## HSP90AA1, WFDC2, PIFO, TPPP3, SAA1, UFC1, CFAP276, CAPSL, OMG, CETN2
## PC_ 4
## Positive: TOP2A, DIAPH3, MKI67, CENPF, ASPM, RRM2, MELK, ANLN, TPX2, KNL1
## NCAPG, KIF14, KIF18B, GTSE1, PBK, ENSG00000284906, CEP55, CENPE, NCAPH, CDK1
## SPC25, KIF15, CKAP2L, BRCA2, ESCO2, KIF4A, BIRC5, TYMS, NDC80, TACC3
## Negative: IGFBP7, DST, COL17A1, CCND2, TNS3, BCAM, IGFBP3, PTPRQ, PALMD, LOXL4
## RBBP8, SOX6, TENM2, MMP13, ANKH, COL12A1, FYB1, FHOD3, LAMA3, PHLDA1
## COL4A6, PMEPA1, SLC7A2, TINAGL1, ABCC3, CDH13, CLU, ADGRL3, TNC, CCDC3
## PC_ 5
## Positive: SERPINB3, KRT13, KRT4, LY6D, SNX31, SERPINB1, SERPINB13, LYPD3, CLCA4, SPINK5
## PLCE1, PLAUR, RNF168, DSG3, GSTA1, LBH, AQP5, MDM2, ABTB3, GPX2
## CADM1, NR4A3, POU2AF1, SMIM35, SCEL, CCDC60, STRBP, ITPKC, IGSF11, RERG
## Negative: SAA1, IFIT3, SAA2, IFIT1, IFI6, ISG15, IDO1, IFITM1, HLA-DRB1, SLC26A4
## MX2, DDX60L, MX1, MT2A, EPSTI1, IFIT2, HERC6, HLA-DPA1, RSAD2, SAMD9L
## HLA-DPB1, DDX60, USP18, CXCL6, HERC5, SLC5A8, RNF213, HLA-DQA1, HLA-DRA, CXCL5
seurat_phase <- CellCycleScoring(seurat_phase,
g2m.features = g2m.genes,
s.features = s.genes)
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
PCA colored by cell cycle phase:
DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase")
The left side of the graph is dominated by G1 and G2M cells. I will regress out variation due to cell cycle genes in the next step.
Do the same PCA for mitochondrial gene expression
#Get distribution of MT gene expression:
summary(seurat_phase@meta.data$percent.mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1665 8.4534 14.0768 14.0511 18.8004 39.9950
#Based on these values, create a new variable to bin cells based on level of MT gene expression
seurat_phase@meta.data$mitoFr <- cut(seurat_phase@meta.data$percent.mt,
breaks=c(-Inf, 8.4534, 14.0768, 18.8004, Inf),
labels=c("Low","Medium","Medium high", "High"))
Plot PCA and color according to their MT gene expression “bin”.
DimPlot(seurat_phase,
size=10,
reduction = "pca",
group.by = "mitoFr")
There are differences on the PC_1, especially in low MT-expressing cells, so I will regress this variation out as well.
#Split the object into mock and cal conditions
split_seurat <- SplitObject(seurat_phase, split.by = "condition")
#Perform SCtransformation for the mock dataset first
mock <- SCTransform(split_seurat[["mock"]],
vst.flavor = "v2",
vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"))
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Total Step 1 genes: 16548
## Total overdispersed genes: 15496
## Excluding 1052 genes from Step 1 because they are not overdispersed.
## Variance stabilizing transformation of count matrix of size 17313 by 12610
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Setting estimate of 173 genes to inf as theta_mm/theta_mle < 1e-3
## # of step1 poisson genes (variance < mean): 0
## # of low mean genes (mean < 0.001): 826
## Total # of Step1 poisson genes (theta=Inf; variance < mean): 182
## Total # of poisson genes (theta=Inf; variance < mean): 1788
## Calling offset model for all 1788 poisson genes
## Found 197 outliers - those will be ignored in fitting/regularization step
## Ignoring theta inf genes
## Replacing fit params for 1788 poisson genes by theta=Inf
## Setting min_variance based on median UMI: 0.04
## Second step: Get residuals using fitted parameters for 17313 genes
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 17313 genes
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 1.691986 mins
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt, S.Score, G2M.Score
## Centering data matrix
## Set default assay to SCT
#Then for the cal-infected dataset
cal <- SCTransform(split_seurat[["cal"]],
vst.flavor = "v2",
vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"))
## vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
## Calculating cell attributes from input UMI matrix: log_umi
## Total Step 1 genes: 17139
## Total overdispersed genes: 16338
## Excluding 801 genes from Step 1 because they are not overdispersed.
## Variance stabilizing transformation of count matrix of size 18033 by 17955
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 5000 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Setting estimate of 197 genes to inf as theta_mm/theta_mle < 1e-3
## # of step1 poisson genes (variance < mean): 0
## # of low mean genes (mean < 0.001): 897
## Total # of Step1 poisson genes (theta=Inf; variance < mean): 209
## Total # of poisson genes (theta=Inf; variance < mean): 1645
## Calling offset model for all 1645 poisson genes
## Found 211 outliers - those will be ignored in fitting/regularization step
## Ignoring theta inf genes
## Replacing fit params for 1645 poisson genes by theta=Inf
## Setting min_variance based on median UMI: 0.16
## Second step: Get residuals using fitted parameters for 18033 genes
##
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======== | 11%
|
|========= | 14%
|
|=========== | 16%
|
|============= | 19%
|
|=============== | 22%
|
|================= | 24%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 32%
|
|========================= | 35%
|
|========================== | 38%
|
|============================ | 41%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 59%
|
|============================================ | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|========================================================= | 81%
|
|=========================================================== | 84%
|
|============================================================= | 86%
|
|============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 18033 genes
##
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======== | 11%
|
|========= | 14%
|
|=========== | 16%
|
|============= | 19%
|
|=============== | 22%
|
|================= | 24%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 32%
|
|========================= | 35%
|
|========================== | 38%
|
|============================ | 41%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 59%
|
|============================================ | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|========================================================= | 81%
|
|=========================================================== | 84%
|
|============================================================= | 86%
|
|============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 2.328911 mins
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt, S.Score, G2M.Score
## Centering data matrix
## Set default assay to SCT
mock_cal <- list(mock = mock, cal = cal)
integ_features <- SelectIntegrationFeatures(object.list = mock_cal,
nfeatures = 3000)
# Prepare the SCT list object for integration
mock_cal <- PrepSCTIntegration(object.list = mock_cal,
anchor.features = integ_features)
#Finally perform the actual integration (uses what's called canonical correlation analysis)
integ_anchors <- FindIntegrationAnchors(object.list = mock_cal,
normalization.method = "SCT",
anchor.features = integ_features)
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 32456 anchors
## Filtering anchors
## Retained 24496 anchors
seurat_integrated <- IntegrateData(anchorset = integ_anchors,
normalization.method = "SCT")
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
#Perform PCA
seurat_integrated <- RunPCA(object = seurat_integrated)
## PC_ 1
## Positive: S100A2, KRT5, S100A9, IGFBP6, KRT17, KRT6A, KRT15, KRT13, SERPINB3, LY6D
## RPS18, RPS12, KRT19, TPT1, RPLP1, KRT14, S100A8, RPS2, FABP5, RPL10
## S100A14, TIMP1, EEF1A1, RPL13, CSTA, SFN, HSPB1, TMSB10, PHLDB2, FGFBP1
## Negative: CFAP299, DNAH12, DNAH7, WDR49, AGBL4, CFAP54, HYDIN, LRRIQ1, DNAH6, DNAH5
## DCDC1, DNAH3, SPAG17, TMEM232, ARMC3, PACRG, DNAH11, CFAP47, DNAH9, NEK10
## CDHR3, RFX3, SPATA17, RP1, CFAP43, SERPINI2, ZBBX, ULK4, STK33, FANK1
## PC_ 2
## Positive: S100A2, IGFBP6, KRT14, KRT15, KRT5, LAMA3, KRT17, IGFBP7, DST, KRT6A
## COL17A1, S100A10, SFN, TIMP1, MT2A, LGALS1, TENM2, CCDC3, CAV1, PHLDB2
## BCAM, TP63, FABP5, RBBP8, FLNA, KRT13, CDH13, FRMD6, NRG1, TINAGL1
## Negative: ANKRD36C, BPIFB1, CD55, PIGR, FCGBP, RIMS1, MUC5B, LCN2, RUNX1, ABCA13
## CP, PLAC8, WFDC2, MACC1, CAPN13, CEACAM6, TMC5, SLC6A14, MUC16, C3
## FER1L6, SLPI, MSMB, PLEKHS1, RARRES1, MUC4, SCGB3A1, GPRC5A, HDAC9, NEBL
## PC_ 3
## Positive: SERPINB3, S100A9, LCN2, LY6D, SLPI, WFDC2, GSTA1, SAA2, SERPINB1, TSPAN1
## CAPS, SCGB3A1, KRT4, SMIM22, MORN2, SERPINB4, AQP5, PI3, CYP4B1, CRIP1
## C9orf24, KRT13, C20orf85, S100A6, CSTA, PIERCE1, UPK1B, PIFO, PSCA, TUBA1A
## Negative: DST, IGFBP6, LAMA3, KRT14, ANKRD36C, COL17A1, KRT15, TENM2, RBBP8, CCDC3
## ITGB1, LGALS1, CAV1, LAMC2, GRIP1, CDH13, TNC, BCAM, TNS3, NRG1
## CBLB, FLNA, KRT17, LAMB3, TP63, ANKH, S100A2, MT2A, TINAGL1, FHOD3
## PC_ 4
## Positive: KRT13, LY6D, KRT4, SERPINB3, SERPINB1, CHST9, DENND2C, CSTA, SPINK5, SERPINB13
## MAPK10, PLAUR, LBH, DSG3, RNF168, AQP3, GSTA1, MYO1E, MTSS1, DSP
## S100A9, CLCA4, PLCE1, MAP3K8, ITPKC, RAB11FIP1, LYPD3, MDM2, ZNF750, SERPINB4
## Negative: SAA2, SAA1, BPIFB1, PIGR, LCN2, FCGBP, RARRES1, SLC26A4, IDO1, MSMB
## CEACAM6, MUC5B, CXCL6, C3, WFDC2, DEFB4A, MT2A, HLA-DRB1, IGFBP6, SLPI
## PROM1, KRT14, IGFBP7, LYPD2, SCGB3A1, HLA-DRA, HLA-DPA1, HLA-DPB1, DST, CXCL5
## PC_ 5
## Positive: OMG, TMEM190, IGFBP7, C9orf24, RRAD, CRIP1, MORN2, GPRC5A, CFAP276, ODF3B
## C20orf85, FAM216B, PIERCE1, MACC1, RIIAD1, PLAC8, MKI67, CETN2, SNTN, WDR38
## NWD1, PIFO, PLAAT2, ANKFN1, PLAUR, TUBA1A, C11orf97, CAPSL, CTXN1, MUC16
## Negative: CDC20B, BPIFB1, CCNO, SCGB3A1, FOXN4, CEP128, SMIM35, ANLN, SCGB1A1, SLPI
## LCN2, PLK4, E2F7, CCDC171, DEUP1, MSMB, CEP112, PLCE1, KIF24, PCM1
## SLC4A8, WFDC2, CROCC2, STRBP, STIL, HES6, SDK1, H2BC11, PIGR, PI3
# Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated,
dims = 1:40,
reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:39:26 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 16:39:26 Read 30565 rows and found 40 numeric columns
## 16:39:26 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 16:39:26 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:39:30 Writing NN index file to temp file /tmp/Rtmpnxirbj/file2cf0af6f206328
## 16:39:30 Searching Annoy index using 1 thread, search_k = 3000
## 16:39:43 Annoy recall = 100%
## 16:39:45 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:39:47 Initializing from normalized Laplacian + noise (using irlba)
## 16:39:48 Commencing optimization for 200 epochs, with 1358062 positive edges
## 16:40:20 Optimization finished
# Plot UMAP
DimPlot(seurat_integrated, group.by = "condition")
seurat_integrated <- FindNeighbors(object = seurat_integrated,
dims = 1:40)
## Computing nearest neighbor graph
## Computing SNN
seurat_integrated <- FindClusters(object = seurat_integrated,
resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 30565
## Number of edges: 1162079
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9481
## Number of communities: 12
## Elapsed time: 4 seconds
#Change factor levels of seurat integrated conditions:
seurat_integrated@meta.data$condition <- factor(seurat_integrated@meta.data$condition, levels = c("mock", "cal"))
DimPlot(seurat_integrated,
reduction = "umap",
label = TRUE,
label.size = 6,
split.by = "condition") &
ggtitle("UMAP of integrated data-by cluster") &
labs(caption = "Resolution = 0.2")
#Reset default assay
DefaultAssay(seurat_integrated) <- "RNA"
#Calculate percent flu genes
seurat_integrated[["percent.flu"]] <- PercentageFeatureSet(seurat_integrated, pattern = "Cal-")
summary(seurat_integrated@meta.data$percent.flu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.01811 0.10368 0.04241 43.25438
#Create bins for low, medium, or highly flu-infected cells.
seurat_integrated@meta.data$flu_fraction <- cut(seurat_integrated@meta.data$percent.flu,
breaks=c(-Inf, 0.07, 1, 10, Inf),
labels=c("Uninfected","Low","Medium", "High"))
mycols <- c("lightgray", "#FFFF99", "orange", "red")
DimPlot(seurat_integrated,
size=10,
reduction = "umap",
group.by = "flu_fraction",
split.by = "condition",
cols = alpha(mycols,0.66)) &
ggtitle("Fraction of flu reads per cell\nCombined analysis") &
labs(caption = "Uninfected: < 0.07% flu\nLow: 0.07-1% flu\nMedium: 1-10% flu\nHigh: >10% flu")
Find conserved markers in each cluster:
# Create function to get conserved markers for any given cluster. This works similarly to FindAllMarkers but uses the correct analysis with the integrated dataset.
get_conserved <- function(cluster){
FindConservedMarkers(seurat_integrated,
assay = "RNA",
ident.1 = cluster,
grouping.var = "condition",
only.pos = TRUE) %>%
tibble::rownames_to_column(var = "gene") %>%
cbind(cluster_id = cluster, .)
}
#Get markers for all 12 clusters in the integrated dataset
conserved_markers <- map_dfr(c(0:11), get_conserved)
## Testing group mock: (0) vs (3, 1, 2, 6, 4, 7, 5, 10, 11, 8, 9)
## Testing group cal: (0) vs (4, 5, 1, 2, 3, 8, 6, 9, 11, 7, 10)
## Testing group mock: (1) vs (0, 3, 2, 6, 4, 7, 5, 10, 11, 8, 9)
## Testing group cal: (1) vs (4, 5, 0, 2, 3, 8, 6, 9, 11, 7, 10)
## Testing group mock: (2) vs (0, 3, 1, 6, 4, 7, 5, 10, 11, 8, 9)
## Testing group cal: (2) vs (4, 5, 1, 0, 3, 8, 6, 9, 11, 7, 10)
## Testing group mock: (3) vs (0, 1, 2, 6, 4, 7, 5, 10, 11, 8, 9)
## Testing group cal: (3) vs (4, 5, 1, 0, 2, 8, 6, 9, 11, 7, 10)
## Testing group mock: (4) vs (0, 3, 1, 2, 6, 7, 5, 10, 11, 8, 9)
## Testing group cal: (4) vs (5, 1, 0, 2, 3, 8, 6, 9, 11, 7, 10)
## Testing group mock: (5) vs (0, 3, 1, 2, 6, 4, 7, 10, 11, 8, 9)
## Testing group cal: (5) vs (4, 1, 0, 2, 3, 8, 6, 9, 11, 7, 10)
## Testing group mock: (6) vs (0, 3, 1, 2, 4, 7, 5, 10, 11, 8, 9)
## Testing group cal: (6) vs (4, 5, 1, 0, 2, 3, 8, 9, 11, 7, 10)
## Testing group mock: (7) vs (0, 3, 1, 2, 6, 4, 5, 10, 11, 8, 9)
## Testing group cal: (7) vs (4, 5, 1, 0, 2, 3, 8, 6, 9, 11, 10)
## Testing group mock: (8) vs (0, 3, 1, 2, 6, 4, 7, 5, 10, 11, 9)
## Testing group cal: (8) vs (4, 5, 1, 0, 2, 3, 6, 9, 11, 7, 10)
## Testing group mock: (9) vs (0, 3, 1, 2, 6, 4, 7, 5, 10, 11, 8)
## Testing group cal: (9) vs (4, 5, 1, 0, 2, 3, 8, 6, 11, 7, 10)
## Testing group mock: (10) vs (0, 3, 1, 2, 6, 4, 7, 5, 11, 8, 9)
## Testing group cal: (10) vs (4, 5, 1, 0, 2, 3, 8, 6, 9, 11, 7)
## Testing group mock: (11) vs (0, 3, 1, 2, 6, 4, 7, 5, 10, 8, 9)
## Testing group cal: (11) vs (4, 5, 1, 0, 2, 3, 8, 6, 9, 7, 10)
cluster_marker_list <- conserved_markers %>%
filter(minimump_p_val < 0.05) %>%
group_by(cluster_id) %>%
arrange(desc(mock_avg_log2FC), .by_group = TRUE)
#Load scripts from SCType github
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R"); source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# load auto-detection function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/auto_detect_tissue_type.R")
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx";
#Prepare cell type marker database from scType
tissue = "Lung" # e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus
# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)
# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = seurat_integrated[["integrated"]]@scale.data,
scaled = TRUE,
gs = gs_list$gs_positive,
gs2 = gs_list$gs_negative)
# merge by cluster
cL_resutls = do.call("rbind", lapply(unique(seurat_integrated@meta.data$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(seurat_integrated@meta.data[seurat_integrated@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(seurat_integrated@meta.data$seurat_clusters==cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
sctype_scores[,1:3]
## # A tibble: 12 × 3
## # Groups: cluster [12]
## cluster type scores
## <fct> <chr> <dbl>
## 1 0 Airway epithelial cells 6244.
## 2 3 Airway goblet cells 23243.
## 3 1 Basal cells (Airway progenitor cells) 25648.
## 4 2 Clara cells 5810.
## 5 6 Ciliated cells 28190.
## 6 4 Airway goblet cells 11964.
## 7 7 Ciliated cells 2796.
## 8 5 Ciliated cells 1888.
## 9 10 Ciliated cells 696.
## 10 11 Ionocytes 1286.
## 11 8 Basal cells (Airway progenitor cells) 1535.
## 12 9 Ciliated cells 1557.
#Merge cell types with cluster IDs on UMAP
seurat_integrated@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j,];
seurat_integrated@meta.data$customclassif[seurat_integrated@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
#Rename club cells
seurat_integrated@meta.data <- seurat_integrated@meta.data %>%
dplyr::mutate(across('customclassif', str_replace, 'Clara cells', 'Club cells'),
across('customclassif', str_replace, 'Airway goblet cells', 'Secretory and goblet cells'),
across('customclassif', str_replace, 'Basal cells (Airway progenitor cells)', 'Basal cells'))
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `across("customclassif", str_replace, "Clara cells", "Club
## cells")`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
Idents(seurat_integrated) <- "customclassif"
#Reorder factor levels
DimPlot(seurat_integrated, reduction = "umap", label = FALSE, repel = TRUE, group.by = 'customclassif', order = rev(c("Ciliated cells", "Secretory and goblet cells", "Basal cells (Airway progenitor cells)", "Airway epithelial cells", "Club cells", "Ionocytes"))) &
ggtitle("Integrated UMAP by cell type") &
scale_color_manual(values = c("#1080FF", "#8000FF", "#FB02FF", "#00C13B", "#48EFDA", "#CC9ECB")) &
NoLegend()
UMAP of the flow markers (except for SiR-Tubulin because there is no specific gene for microtubule production):
FeaturePlot(seurat_integrated, features = c("NGFR", "CEACAM6", "TSPAN8"))
Compare infected to uninfected condition:
#Read in ISG list
human.isgs <- read.csv("Human ISG list.csv", header = TRUE)
#Filter ISGs to those present in the seurat dataset
human.isgs$present_in_seurat <- human.isgs$isg_name %in% rownames(seurat_integrated)
present_isgs <- subset(human.isgs, present_in_seurat == "TRUE")
ciliated_flu_markers <- FindMarkers(seurat_integrated, ident.1 = "cal", ident.2 = "mock", group.by = "condition", subset.ident = "Ciliated cells")
#Filter by ISGs
ciliated_flu_isgs <- ciliated_flu_markers %>%
filter(row.names(ciliated_flu_markers) %in% c(human.isgs$isg_name) & p_val_adj < 0.05)
Compare infected vs uninfected:
secretory_flu_markers <- FindMarkers(seurat_integrated, ident.1 = "cal", ident.2 = "mock", group.by = "condition", subset.ident = "Secretory and goblet cells")
#Filter by ISGs
secretory_flu_isgs <- secretory_flu_markers %>%
filter(row.names(secretory_flu_markers) %in% c(human.isgs$isg_name) & p_val_adj < 0.05)
#Set identities of integrated object to the condition
Idents(seurat_integrated) <- "condition"
#Pull out mock dataset
mock_integrated <- subset(seurat_integrated, idents = c("mock"))
#Scale data in the RNA assays
mock_integrated <- ScaleData(mock_integrated, features = present_isgs$isg_name)
## Centering and scaling data matrix
#Set identities to cell type
Idents(mock_integrated) <- "customclassif"
#Find markers for each cell type
mock_markers <- FindAllMarkers(mock_integrated)
## Calculating cluster Airway epithelial cells
## Calculating cluster Secretory and goblet cells
## Calculating cluster Basal cells (Airway progenitor cells)
## Calculating cluster Club cells
## Calculating cluster Ciliated cells
## Calculating cluster Ionocytes
#Filter markers down to ISGs, and filter to only padj < 0.05
mock_isg_markers_all <- mock_markers %>%
filter(rownames(.) %in% present_isgs$isg_name & p_val_adj < 0.05) %>%
group_by(cluster) %>%
arrange(-avg_log2FC, .by_group = TRUE)
#Top 50 only (use for graphing)
mock_isg_markers <- mock_markers %>%
filter(rownames(.) %in% present_isgs$isg_name & p_val_adj < 0.05) %>%
group_by(cluster) %>%
slice_max(n = 50, order_by = avg_log2FC) %>%
arrange(-avg_log2FC, .by_group = TRUE)
#Reorder the cell type levels so that they will be the same between heat maps
levels(mock_integrated) <- c("Ciliated cells", "Secretory and goblet cells", "Basal cells (Airway progenitor cells)", "Airway epithelial cells", "Club cells", "Ionocytes")
#Heatmap of mock condition, cell type specific ISGs
DoHeatmap(mock_integrated, features = mock_isg_markers$gene, assay = "RNA", group.colors = c("#1080FF", "#8000FF", "#FB02FF", "#00C13B", "#48EFDA", "#CC9ECB")) &
ggtitle("ISG Heatmap of Mock-infected cells\nCombined analysis") &
labs(caption = "Cell type identities were determined with full gene set.\nThe most variable genes per cluster were found with FindAllMarkers.\nCell type specific markers were filtered to ISGs and padj < 0.05.\n50 ISGs per cluster are plotted (or as many as are there if less than 50).")
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /common/software/install/migrated/openblas/0.3.5_gcc8.2.0_multiarch/lib/libopenblasp-r0.3.5.so; LAPACK version 3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Chicago
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] purrr_1.0.2 data.table_1.14.8 stringr_1.5.0 openxlsx_4.2.5.2
## [5] HGNChelper_0.8.1 dplyr_1.1.3 ggplot2_3.4.4 sctransform_0.4.1
## [9] SeuratObject_5.0.0 Seurat_4.3.0
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.21 splines_4.3.0
## [3] later_1.3.1 bitops_1.0-7
## [5] tibble_3.2.1 polyclip_1.10-6
## [7] lifecycle_1.0.4 Rdpack_2.6
## [9] globals_0.16.2 lattice_0.21-8
## [11] MASS_7.3-58.4 magrittr_2.0.3
## [13] limma_3.58.1 plotly_4.10.3
## [15] sass_0.4.7 rmarkdown_2.25
## [17] plotrix_3.8-4 jquerylib_0.1.4
## [19] yaml_2.3.7 qqconf_1.3.2
## [21] httpuv_1.6.12 sn_2.1.1
## [23] glmGamPoi_1.14.0 spam_2.10-0
## [25] zip_2.3.0 sp_2.1-1
## [27] spatstat.sparse_3.0-3 reticulate_1.34.0
## [29] cowplot_1.1.1 pbapply_1.7-2
## [31] RColorBrewer_1.1-3 multcomp_1.4-25
## [33] abind_1.4-5 zlibbioc_1.48.0
## [35] Rtsne_0.16 GenomicRanges_1.54.1
## [37] BiocGenerics_0.48.1 RCurl_1.98-1.13
## [39] TH.data_1.1-2 sandwich_3.0-2
## [41] GenomeInfoDbData_1.2.11 IRanges_2.36.0
## [43] S4Vectors_0.40.1 ggrepel_0.9.4
## [45] irlba_2.3.5.1 listenv_0.9.0
## [47] spatstat.utils_3.0-4 TFisher_0.2.0
## [49] goftest_1.2-3 spatstat.random_3.2-1
## [51] fitdistrplus_1.1-11 parallelly_1.36.0
## [53] DelayedMatrixStats_1.24.0 leiden_0.4.3
## [55] codetools_0.2-19 DelayedArray_0.28.0
## [57] tidyselect_1.2.0 farver_2.1.1
## [59] matrixStats_1.1.0 stats4_4.3.0
## [61] spatstat.explore_3.2-5 mathjaxr_1.6-0
## [63] jsonlite_1.8.7 multtest_2.58.0
## [65] ellipsis_0.3.2 progressr_0.14.0
## [67] ggridges_0.5.4 survival_3.5-5
## [69] tools_4.3.0 ica_1.0-3
## [71] Rcpp_1.0.11 glue_1.6.2
## [73] mnormt_2.1.1 gridExtra_2.3
## [75] SparseArray_1.2.2 xfun_0.41
## [77] metap_1.9 mgcv_1.8-42
## [79] MatrixGenerics_1.14.0 GenomeInfoDb_1.38.1
## [81] withr_2.5.2 numDeriv_2016.8-1.1
## [83] fastmap_1.1.1 fansi_1.0.5
## [85] digest_0.6.33 R6_2.5.1
## [87] mime_0.12 colorspace_2.1-0
## [89] scattermore_1.2 tensor_1.5
## [91] spatstat.data_3.0-3 utf8_1.2.4
## [93] tidyr_1.3.0 generics_0.1.3
## [95] httr_1.4.7 htmlwidgets_1.6.2
## [97] S4Arrays_1.2.0 uwot_0.1.16
## [99] pkgconfig_2.0.3 gtable_0.3.4
## [101] lmtest_0.9-40 XVector_0.42.0
## [103] htmltools_0.5.7 dotCall64_1.1-0
## [105] scales_1.2.1 Biobase_2.62.0
## [107] png_0.1-8 knitr_1.45
## [109] rstudioapi_0.15.0 reshape2_1.4.4
## [111] nlme_3.1-162 cachem_1.0.8
## [113] zoo_1.8-12 KernSmooth_2.23-20
## [115] parallel_4.3.0 miniUI_0.1.1.1
## [117] pillar_1.9.0 grid_4.3.0
## [119] vctrs_0.6.4 RANN_2.6.1
## [121] promises_1.2.1 xtable_1.8-4
## [123] cluster_2.1.4 evaluate_0.23
## [125] mvtnorm_1.2-3 cli_3.6.1
## [127] compiler_4.3.0 rlang_1.1.2
## [129] crayon_1.5.2 future.apply_1.11.0
## [131] mutoss_0.1-13 labeling_0.4.3
## [133] plyr_1.8.9 stringi_1.8.1
## [135] viridisLite_0.4.2 deldir_1.0-9
## [137] munsell_0.5.0 lazyeval_0.2.2
## [139] spatstat.geom_3.2-7 Matrix_1.6-2
## [141] patchwork_1.1.3 sparseMatrixStats_1.14.0
## [143] future_1.33.0 statmod_1.5.0
## [145] shiny_1.7.5.1 SummarizedExperiment_1.32.0
## [147] highr_0.10 rbibutils_2.2.16
## [149] ROCR_1.0-11 igraph_1.5.1
## [151] bslib_0.5.1